The following material provides an overview of a basic analysis of single cell transcriptome data. The material is largely derived (sometimes verbatim) from the following sources:

  1. Seurat2: http://satijalab.org/seurat/
  2. Pagoda2: https://github.com/hms-dbmi/pagoda2
  3. Shekhar et al. https://github.com/broadinstitute/BipolarCell2016

Data set choice:

##################################################
## ********* USER DEFINED SECTION ***************
##################################################

## Data set options:   un-comment just one of thhe 'counts_matrix_filename' options.

##########################################
# **  human peripheral blood mononuclear cells from: https://support.10xgenomics.com/single-cell-gene-expression/datasets
#counts_matrix_filename = "data/pbmcs/pbmc3k.counts.matrix.gz"; org="human"

##############################
# ** mouse retinal bipolar cells, from Shekhar et al. Cell 2016
#counts_matrix_filename = "data/retinal_bipolar_cells/retinal_bipolar.dat.gz"; org="mouse"
load("5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5.RData")
## Pagoda variable settings - depend on organism type
if (org == "human") {
  # get_goSets_func = p2.generate.human.go.web
  # get_go_env = p2.generate.human.go
  suppressMessages(library(org.Hs.eg.db))
  ALIAS2EG = org.Hs.egALIAS2EG
} else if (org == "mouse") {
  # get_goSets_func = p2.generate.mouse.go.web
  # get_go_env = pagoda2:::p2.generate.mouse.go
  suppressMessages(library(org.Mm.eg.db))
  ALIAS2EG = org.Mm.egALIAS2EG
} else {
  stop("Error, not sure what organism we're using")
}

Data preparation

# Read data from your file, rows as genes colums as cells
# myCountMatrix <- read.table(gzfile(counts_matrix_filename), header=T, row.names=1)

myCountMatrix <- as.matrix(assays(scEx)[["counts"]])

Look at the matrix:

myCountMatrix[1:10, 1:3]
##            AAACCCAAGAGACAAG-1 AAACCCAAGGCCTAGA-1 AAACCCAGTCGTGCCA-1
## AL627309.1                  0                  0                  0
## AL669831.5                  0                  0                  0
## FAM87B                      0                  0                  0
## LINC00115                   0                  0                  0
## FAM41C                      0                  0                  0
## AL645608.3                  0                  0                  0
## SAMD11                      0                  0                  0
## NOC2L                       0                  0                  1
## KLHL17                      0                  0                  0
## PLEKHN1                     0                  0                  0

How big is the matrix?

dim(myCountMatrix) # report num rows and cols
## [1] 20824  5247

Size in bytes?

object.size(myCountMatrix)
## 875951744 bytes

Convert the matrix to a sparse matrix

myCountMatrixSparse <- Matrix(as.matrix(myCountMatrix), sparse = T)

# take a look at it:
myCountMatrixSparse[1:10,1:3]
## 10 x 3 sparse Matrix of class "dgCMatrix"
##            AAACCCAAGAGACAAG-1 AAACCCAAGGCCTAGA-1 AAACCCAGTCGTGCCA-1
## AL627309.1                  .                  .                  .
## AL669831.5                  .                  .                  .
## FAM87B                      .                  .                  .
## LINC00115                   .                  .                  .
## FAM41C                      .                  .                  .
## AL645608.3                  .                  .                  .
## SAMD11                      .                  .                  .
## NOC2L                       .                  .                  1
## KLHL17                      .                  .                  .
## PLEKHN1                     .                  .                  .
# check dimensions:
dim(myCountMatrixSparse)
## [1] 20824  5247
# check size:
object.size(myCountMatrixSparse)
## 112964464 bytes
# size reduction:
object.size(myCountMatrixSparse) / object.size(myCountMatrix)
## 0.1 bytes
# Remove the original matrix to reduce memory usage
rm(myCountMatrix)
myCountMatrixSparse.prefiltered = myCountMatrixSparse # store just in case

Filtering ‘bad’ cells

Look at the summary counts

#par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0)

reads_per_cell = Matrix::colSums(myCountMatrixSparse)
reads_per_gene = Matrix::rowSums(myCountMatrixSparse)
genes_per_cell = Matrix::colSums(myCountMatrixSparse>0) # count gene only if it has non-zero reads mapped.
cells_per_gene = Matrix::rowSums(myCountMatrixSparse>0) # only count cells where the gene is expressed

hist(log10(reads_per_cell+1),main='reads per cell',col='wheat')

hist(log10(genes_per_cell+1), main='genes per cell', col='wheat')

plot(reads_per_cell, genes_per_cell, log='xy', col='wheat')

hist(log10(reads_per_gene+1),main='reads per gene',col='wheat')

Plot genes per cell with cells ranked accordingly.

plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')

Cell filtering criteria: define min and max genes per cell

##################################################
## ********* USER DEFINED SECTION ***************
##################################################

#  set upper and lower thresholds for genes per cell:
MIN_GENES_PER_CELL = 350  ## user-defined setting
MAX_GENES_PER_CELL = 1800  ## user-defined setting

# now replot with the thresholds being shown:
plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')
abline(h=MIN_GENES_PER_CELL, col='green')  # lower threshold
abline(h=MAX_GENES_PER_CELL, col='green') # upper threshold

Examine percent mitochondrial read content

# define the mitochondrial genes
mito_genes = grep("^mt-", rownames(myCountMatrixSparse) , ignore.case=T, value=T)
print(mito_genes)
##  [1] "MT-ND1"  "MT-ND2"  "MT-CO1"  "MT-CO2"  "MT-ATP8" "MT-ATP6" "MT-CO3" 
##  [8] "MT-ND3"  "MT-ND4L" "MT-ND4"  "MT-ND5"  "MT-ND6"  "MT-CYB"
# compute pct mito
mito_gene_read_counts = Matrix::colSums(myCountMatrixSparse[mito_genes,])
pct_mito = mito_gene_read_counts / reads_per_cell * 100
plot(sort(pct_mito))

Decide on maximum allowed percent mitochondrial reads:

##################################################
## ********* USER DEFINED SECTION ***************
##################################################

MAX_PCT_MITO = 10   ## user-defined setting

plot(sort(pct_mito))
abline(h=MAX_PCT_MITO, col='red')

cell selection as per Peter Karchenko - the Pagoda way

df = data.frame(reads_per_cell=reads_per_cell, genes_per_cell=genes_per_cell)
head(df)
##                    reads_per_cell genes_per_cell
## AAACCCAAGAGACAAG-1           7373           2362
## AAACCCAAGGCCTAGA-1           3771           1258
## AAACCCAGTCGTGCCA-1           4902           1578
## AAACCCATCGTGCATA-1           6704           1908
## AAACGAAAGACAAGCC-1           3899           1588
## AAACGAAAGAGTGACC-1          11780           3135

Plot gene_per_cell vs. reads_per_cell, define outliers

df = df[order(df$reads_per_cell),] # order by reads_per_cell
plot(df, log='xy')
m <- rlm(genes_per_cell~reads_per_cell,data=df) # robust linear model, not sens to outliers
p.level = 1e-3
# predict genes_per_cell based on observed reads_per_cell
suppressWarnings(pb <- data.frame(predict(m, interval='prediction', 
                                          level = 1-p.level, # define conf interval
                                          type="response")))
polygon(c(df$reads_per_cell, rev(df$reads_per_cell)),
        c(pb$lwr, rev(pb$upr)), col=adjustcolor(2,alpha=0.1), border = NA)

# identifier outliers as having observed genes_per_cell outside the prediction confidence interval
outliers <- rownames(df)[df$genes_per_cell > pb$upr | df$genes_per_cell < pb$lwr];
points(df[outliers,],col=2,cex=0.6)

Now, actually filter out ‘bad’ cells (and genes)

myCountMatrixSparse = myCountMatrixSparse.prefiltered # just in case we re-run this block using different thresholds.

###############################################################
# prune genes, require a gene to be expressed in at least 3 cells

myCountMatrixSparse.prefiltered = myCountMatrixSparse
myCountMatrixSparse = myCountMatrixSparse[cells_per_gene >= 3,]  ## user can change this if needed.

###############################################################
# prune cells
valid_cells = colnames(myCountMatrixSparse) # all cells
message('starting with: ', length(valid_cells), ' cells') # number starting with
## starting with: 5247 cells
## remove cells based on gene count criteria:
valid_cells = valid_cells[genes_per_cell >= MIN_GENES_PER_CELL & genes_per_cell <= MAX_GENES_PER_CELL]  # set values based on your evaluation above
message('after filtering low and high gene count outliers: ', length(valid_cells), ' cells') # number after filtering based gene count thresholds
## after filtering low and high gene count outliers: 2973 cells
## remove cells having excessive mito read content
valid_cells = valid_cells[valid_cells %in% names(pct_mito)[pct_mito <= MAX_PCT_MITO]]
message('after removing high-mito cells: ', length(valid_cells), ' cells') # number remaining after high-mito cells removed
## after removing high-mito cells: 1483 cells
## remove cells identified as outliers via the Karchenko method
valid_cells = valid_cells[ ! valid_cells %in% outliers]
message('after removing final outliers: ', length(valid_cells), ' cells') # number surviving outlier detection
## after removing final outliers: 1483 cells
## update the count matrix to contain only the valid cells
myCountMatrixSparse = myCountMatrixSparse[,valid_cells]
pbmc <- CreateSeuratObject(counts = assays(scEx)[["counts"]], project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat 
## 17787 features across 5197 samples within 1 assay 
## Active assay: RNA (17787 features)

examine contents of seurat2obj

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))

# save original data before subsetting
pbmcOrg = pbmc
pbmc <- subset(pbmcOrg, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 16)
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))

pbmc
## An object of class Seurat 
## 17787 features across 4084 samples within 1 assay 
## Active assay: RNA (17787 features)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
## When using repel, set xnudge and ynudge to 0 for optimal results
CombinePlots(plots = list(plot1, plot2))
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  CD3D, TRAC, LTB, TRBC2, CD3G, IL32, IL7R, CD69, CD247, TRBC1 
##     CD2, CD7, CD27, TCF7, ISG20, ARL4C, SPOCK2, HIST1H4C, BCL11B, GZMM 
##     SYNE2, ITM2A, CCR7, RORA, MAL, CXCR4, LEF1, TRAT1, TRABD2A, CTSW 
## Negative:  LYZ, FCN1, CST3, MNDA, CTSS, PSAP, FGL2, S100A9, AIF1, GRN 
##     LST1, NCF2, CD68, TYMP, SERPINA1, CYBB, CLEC12A, CSTA, TNFAIP2, SPI1 
##     FTL, CPVL, MPEG1, TYROBP, VCAN, KLF4, IGSF6, S100A8, MS4A6A, CD14 
## PC_ 2 
## Positive:  NKG7, CST7, GZMA, PRF1, KLRD1, CTSW, FGFBP2, GNLY, GZMH, CCL5 
##     GZMM, CD247, KLRF1, HOPX, SPON2, ADGRG1, TRDC, MATK, GZMB, FCGR3A 
##     S100A4, EFHD2, CCL4, CLIC3, KLRB1, TBX21, IL2RB, TTC38, PTGDR, ANXA1 
## Negative:  CD79A, MS4A1, BANK1, IGHM, HLA-DQA1, IGKC, CD79B, LINC00926, RALGPS2, TNFRSF13C 
##     VPREB3, SPIB, IGHD, CD22, HLA-DQB1, FCRL1, BLK, FAM129C, GNG7, TCF4 
##     FCRLA, TCL1A, COBLL1, PAX5, P2RX5, BCL11A, CD40, SWAP70, TSPAN13, ARHGAP24 
## PC_ 3 
## Positive:  GZMB, NKG7, GNLY, CLIC3, PRF1, KLRD1, FGFBP2, KLRF1, SPON2, CST7 
##     GZMH, FCGR3A, ADGRG1, GZMA, HOPX, TRDC, CTSW, CCL4, TTC38, HLA-DPB1 
##     PLAC8, C12orf75, PLEK, APOBEC3G, TBX21, PRSS23, MATK, CYBA, SYNGR1, CXXC5 
## Negative:  IL7R, MAL, TCF7, LEF1, TRABD2A, TRAC, CCR7, CD27, FOS, LTB 
##     FHIT, LRRN3, RGCC, TRAT1, CAMK4, RGS10, BCL11B, CD3D, CD40LG, AQP3 
##     FOSB, SOCS3, CD3G, FLT3LG, VIM, TNFRSF25, SLC2A3, INPP4B, TSHZ2, MYC 
## PC_ 4 
## Positive:  MS4A1, CD79A, LINC00926, BANK1, TNFRSF13C, VPREB3, CD79B, RALGPS2, IGHD, FCRL1 
##     BLK, IGHM, CD22, PAX5, ARHGAP24, P2RX5, CD24, S100A12, NCF1, CD19 
##     VNN2, SWAP70, TNFRSF13B, RBP7, FCRLA, S100A8, FCER2, FCRL2, POU2AF1, OSBPL10 
## Negative:  PLD4, FCER1A, SERPINF1, IL3RA, LILRA4, TPM2, GAS6, CLEC10A, CLEC4C, SMPD3 
##     ENHO, ITM2C, SCT, FLT3, P2RY14, PROC, LAMP5, LGMN, PPP1R14B, AC119428.2 
##     PPM1J, PACSIN1, CD1C, RUNX2, DNASE1L3, PTCRA, UGCG, KCNK17, DERL3, SCN9A 
## PC_ 5 
## Positive:  TPM2, LILRA4, CLEC4C, S100A12, SMPD3, IL3RA, DERL3, MZB1, SERPINF1, SCT 
##     PADI4, JCHAIN, PACSIN1, PROC, S100A8, CYP1B1, VNN2, ALOX5AP, PTCRA, ASIP 
##     LINC00996, VCAN, ITM2C, KCNK17, DNASE1L3, EPHB1, LAMP5, MAP1A, RBP7, APP 
## Negative:  C1QA, BATF3, TCF7L2, CDKN1C, CTSL, HES4, SIGLEC10, ABI3, RHOC, FCGR3A 
##     HLA-DQB1, MTSS1, CSF1R, CAMK1, IFITM3, CLEC10A, LY6E, RRAS, HLA-DPA1, ENHO 
##     HLA-DQA1, AC064805.1, ZNF703, HLA-DPB1, YBX1, CLIC2, CKB, CXCL16, NR4A1, GBP1
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  CD3D, TRAC, LTB, TRBC2, CD3G 
## Negative:  LYZ, FCN1, CST3, MNDA, CTSS 
## PC_ 2 
## Positive:  NKG7, CST7, GZMA, PRF1, KLRD1 
## Negative:  CD79A, MS4A1, BANK1, IGHM, HLA-DQA1 
## PC_ 3 
## Positive:  GZMB, NKG7, GNLY, CLIC3, PRF1 
## Negative:  IL7R, MAL, TCF7, LEF1, TRABD2A 
## PC_ 4 
## Positive:  MS4A1, CD79A, LINC00926, BANK1, TNFRSF13C 
## Negative:  PLD4, FCER1A, SERPINF1, IL3RA, LILRA4 
## PC_ 5 
## Positive:  TPM2, LILRA4, CLEC4C, S100A12, SMPD3 
## Negative:  C1QA, BATF3, TCF7L2, CDKN1C, CTSL
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:20)
## Warning: Removed 28283 rows containing missing values (geom_point).

ElbowPlot(pbmc)

pbmc <- FindNeighbors(pbmc, dims = 1:5)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 4084
## Number of edges: 123986
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8984
## Number of communities: 11
## Elapsed time: 0 seconds
head(Idents(pbmc), 5)
## AAACCCAAGAGACAAG-1 AAACCCAAGGCCTAGA-1 AAACCCAGTCGTGCCA-1 
##                  2                  0                  1 
## AAACCCATCGTGCATA-1 AAACGAAAGACAAGCC-1 
##                  1                  6 
## Levels: 0 1 2 3 4 5 6 7 8 9 10
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunTSNE(pbmc, dims = 1:20)
# pbmc <- RunUMAP(pbmc, dims = 1:10)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "tsne")

DimPlot(pbmc, reduction = "pca")

saveRDS(pbmc, file = "pbmc_tutorial.rds")
# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
##              p_val avg_logFC pct.1 pct.2     p_val_adj
## IL7R 3.860251e-200 1.3177679 0.889 0.359 6.866229e-196
## IL32 3.722698e-197 1.1168309 0.957 0.396 6.621562e-193
## TRAC 2.692170e-147 0.8915222 0.932 0.430 4.788564e-143
## CD2  8.540231e-131 0.6987309 0.849 0.359 1.519051e-126
## LTB  6.242477e-123 0.9596513 0.963 0.640 1.110349e-118
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0,3), min.pct = 0.25)
head(cluster5.markers, n = 5)
##          p_val avg_logFC pct.1 pct.2 p_val_adj
## CD79A        0  2.538752 0.990 0.025         0
## MS4A1        0  2.435961 0.970 0.011         0
## HLA-DQA1     0  2.429548 0.973 0.006         0
## HLA-DQB1     0  2.067034 0.943 0.015         0
## BANK1        0  1.840449 0.885 0.001         0
# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
## # A tibble: 22 x 7
## # Groups:   cluster [11]
##        p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene  
##        <dbl>     <dbl> <dbl> <dbl>     <dbl> <fct>   <chr> 
##  1 0.            1.02  0.68  0.095 0.        0       CCR7  
##  2 3.53e-293     0.997 0.957 0.66  6.28e-289 0       LDHB  
##  3 3.86e-200     1.32  0.889 0.359 6.87e-196 1       IL7R  
##  4 3.86e-117     1.20  0.396 0.072 6.86e-113 1       GZMK  
##  5 0.            2.16  1     0.374 0.        2       S100A9
##  6 0.            1.99  1     0.266 0.        2       S100A8
##  7 3.22e-258     1.72  0.995 0.211 5.72e-254 3       NKG7  
##  8 1.79e-243     1.86  0.956 0.217 3.18e-239 3       CCL5  
##  9 1.17e-204     1.25  0.991 0.266 2.08e-200 4       CPVL  
## 10 4.95e-165     1.35  0.908 0.247 8.81e-161 4       IL1B  
## # … with 12 more rows
cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))

VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)

FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", 
    "CD8A"))

top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()

THIS HAS TO BE CHANGED manually!!!

new.cluster.ids <- c("Dendritic", "NK", "Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", 
    "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()

saveRDS(pbmc, file = "pbmc3k_final.rds")